In [2]:
# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.optimize
# Numerical differentiation packages
import numdifftools as ndt
# Import pyplot for plotting
import matplotlib.pyplot as plt
# Seaborn, useful for graphics
import seaborn as sns
# Bokeh stuff
import bokeh.charts
import bokeh.io
# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline
# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
# Display graphics in this notebook
bokeh.io.output_notebook()
In [4]:
#load data
df= pd.read_csv('../input/invitro_droplet_data.csv', comment= '#')
df.head()
Out[4]:
In [7]:
#compute spindle volume
spindle_volume= np.pi*df['Spindle Length (um)']\
*df['Spindle Width (um)']**2/6.0*1e-9
#compute the ratio v_s/v_0
vol_ratio= spindle_volume/ df['Droplet Volume (uL)']
n, b, p= plt.hist(vol_ratio, bins= int(np.sqrt(len(vol_ratio))))
plt.xlabel(r'$V_\mathrm{s} / V_0$')
plt.ylabel('count')
Out[7]:
In [8]:
#compute aspect ratio, k
k= df['Spindle Width (um)']/ df['Spindle Length (um)']
#Plot histogram of results
n, b, p= plt.hist(k, bins= int(np.sqrt(len(k))))
plt.xlabel(r'$k')
plt.ylabel('count')
Out[8]:
In [10]:
#take a quick look at the data
p= bokeh.charts.Scatter(df, x= 'Droplet Diameter (um)', \
y= 'Spindle Length (um)')
bokeh.io.show(p)
In [11]:
#estimate the spindle length assuming constant spindle length
#i.e., each spindle is of length theta plus/minus gaussian error
theta= df['Spindle Length (um)'].mean()
r= df['Spindle Length (um)'].std()
n= df['Spindle Length (um)'].count()
# Print mean plus minus a stanrdard error of the mean
print("""
Model a results (≈68% of total posterior probability)
-----------------------------------------------------
θ = {0:.1f} ± {1:.1f} µm
""".format(theta, r / np.sqrt(n)))
In [14]:
plt.plot(df['Droplet Diameter (um)'], df['Spindle Length (um)'], '.'\
, markersize= 10, alpha= 0.25)
plt.plot([0,250], [theta, theta], '-', color= sns.color_palette()[2])
plt.xlabel(u'droplet diamete (µm)')
plt.ylabel(u'spindle length (µm)')
Out[14]:
Model b assumes that spindle size is dependent on total tubulin concentration $$l(d) = \frac{\gamma d}{(1+(\frac{\gamma d}{\theta})^3)^\frac{1}{3}}$$
Use a uniform prior for $$\gamma$$ and $$\theta$$ and a jeffrey prior for the data variance $$\sigma$$
In [35]:
def spindle_length(p, d):
"""
Theoretical model for spindle length
"""
theta, gamma= p
return gamma*d/np.cbrt(1+(gamma*d/theta)**3)
#post marginalization over the variance sigma
def log_post(p, d, ell):
"""
Compute log of posterior for single set of parameters
p[0]= theta
p[1]= gamma
"""
#unpack parameters
theta, gamma= p
#theoretical length
ell_theor= spindle_length(p, d)
return -len(d)/2*np.log(np.sum((ell - ell_theor)**2))
#parameter values to plot
th= np.linspace(37, 40, 100)
gamma= np.linspace(0.8, 0.95, 100)
#make a grid
tt, gg= np.meshgrid(th, gamma)
#compute log posterior
log_posterior= np.empty_like(tt)
for j in range(len(th)):
for i in range(len(gamma)):
p= np.array([tt[i, j], gg[i, j]]) #parameters to unpack
log_posterior[i, j]= log_post(p, df['Droplet Diameter (um)'],\
df['Spindle Length (um)'])
#scale
log_posterior -= log_posterior.max()
plt.contourf(tt, gg, np.exp(log_posterior), cmap= plt.cm.Blues,\
alpha= 0.7)
plt.xlabel(r'$\theta$ (µm)')
plt.ylabel(r'$\gamma$')
Out[35]:
The a posteriori estimate is the set of parameter values that maximize the posterior probability. This can be done by minimizing the residuals for the model, since the sum of the residuals are inversely proportional to the posterior probability of the parameters
In [37]:
def resid(p, d, ell):
"""
Residuals for spindle length
"""
return ell - spindle_length(p,d)
In [38]:
#Initial guess
p0= np.array([40, 0.6])
In [41]:
#place extra args as tuple
args= (df['Droplet Diameter (um)'], df['Spindle Length (um)'])
#compute map
popt, _= scipy.optimize.leastsq(resid, p0, args= args)
#extract vals
theta, gamma= popt
# Print results
print("""
Model 2b, most probable parameters
----------------------------------
θ = {0:.1f} µm
γ = {1:.2f}
""".format(theta, gamma))
In [43]:
#plot
#values of droplet diameter
d_plot= np.linspace(0, 250, 200)
#theoretical curve
spindle_theor= spindle_length(popt, d_plot)
#plot
plt.plot(df['Droplet Diameter (um)'], df['Spindle Length (um)'],\
'.', markersize= 10, alpha= 0.25)
plt.plot(d_plot, spindle_theor, '-', color= sns.color_palette()[2])
# Use unicode to do plot label with microns with nicely formatted mu
plt.xlabel(u'droplet diameter (µm)')
plt.ylabel(u'spindle length (µm)')
Out[43]:
In [46]:
# Instantiate Hessian for log posterior
hes_fun = ndt.Hessian(log_post)
# Compute Hessian at MAP
hes = hes_fun(popt, df['Droplet Diameter (um)'],\
df['Spindle Length (um)'])
# Compute the covariance matrix
cov = -np.linalg.inv(hes)
# Look at it
cov
# Report results
print("""
Results for Model b (≈ 68% of total probability)
------------------------------------------------
θ = {0:.1f} ± {1:.1f} µm
γ = {2:.2f} ± {3:.2f}
""".format(theta, np.sqrt(cov[0,0]), gamma, np.sqrt(cov[1,1])))